####################################################################
## author:    A. D?r, Robert A. Huber, Yannick Stiller
## contact:   robert.huber@sbg.ac.at
## file name: MP_data_checks.R
## Context:   Understanding MP Trade Preferences
## started:   2019-07-11
## Summary:   Tests MP data for plausibility
######################################################################

#. Preparation ####

#clear
rm(list=ls())

#set working dir
getwd()
# the replication_materials_LSQ.RProj should automatically set the working direction to this folder

#useful function for loop
'%!in%' <- function(x,y)!('%in%'(x,y))

#packages
library(tidyverse)

#read data

#read individual Data
df <- readRDS("./data/df.rds")
#read national Data
national <- readRDS("./data/national.rds")


#### National Manufacturing Share #####

#National Values from our survey data
national <- subset(national, abs(merge_year - survey_year)<3)
national$country_survey_year <- paste0(national$country_short, "_", national$survey_year)
national <- national[,c("country_survey_year", "share_manu", "share_agri", "share_serv", "share_rest")]
national <- unique(national)

#National shares in sectors from external reference categories

# Note: 
#  data source with ILO estimates in years without data:
# https://www.ilo.org/shinyapps/bulkexplorer1/?lang=en&segment=indicator&id=EMP_2EMP_SEX_ECO_NB_A


#Load and prepare ILO data

ilo <- readstata13::read.dta13("./data/employment_by_country_ilo.dta")

ilo <- ilo[,c(1,9,11:12)]
colnames(ilo) <- c("country", "classification", "year", "ilo_value")

ilo$country_year <- paste0(ilo$country, "_", ilo$year)
ilo <- ilo[,c("country_year", "classification", "ilo_value")]

ilo$classification <- gsub(".*_","",ilo$classification)

ilo <- ilo %>%
  spread(classification, ilo_value) %>%
  mutate(ilo_manu = (C)/TOTAL, ilo_agri = (A + B)/TOTAL, ilo_serv = (DE + F + G + HJ + I + K + LMN + O + P + Q+ RSTU)/TOTAL , ilo_rest = (DE)/TOTAL) %>%
  dplyr::select(., c(country_year, ilo_manu, ilo_agri, ilo_serv, ilo_rest))

national <- merge(national, ilo, by.x = "country_survey_year", by.y= "country_year", all.x = T)
national$difference_manu <- abs(national$share_manu - national$ilo_manu)
national$difference_agri <- abs(national$share_agri - national$ilo_agri)
national$difference_serv <- abs(national$share_serv - national$ilo_serv)
national$difference_rest <- abs(national$share_rest - national$ilo_rest)

national$country_year 

# correlation between all three measures
cor.test(national$share_manu, national$ilo_manu)
cor.test(national$share_agri, national$ilo_agri)
cor.test(national$share_serv, national$ilo_serv)
cor.test(national$share_rest, national$ilo_rest)

#Figures

ggplot(national, aes(x = share_manu*100, y = ilo_manu*100, label = country_survey_year)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1) +
  ggthemes::theme_tufte() +
  ggrepel::geom_text_repel(aes(label=country_survey_year),size=2.5) +
  coord_cartesian(xlim = c(7, 18), ylim = c(7, 18)) +
  labs(x = "Share of manufacturing in Competitiveness Data\nin %",
       y = "Share of manufacturing in ILO Data\nin %") +
  theme(text = element_text(size = 12)) +
  NULL
  
ggsave(filename = "./figures/FigureF3.pdf",
       width = 17, height = 12, units = "cm")

ggplot(national, aes(x = share_agri*100, y = ilo_agri*100, label = country_survey_year)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1) +
  ggthemes::theme_tufte() +
  ggrepel::geom_text_repel(aes(label=country_survey_year),size=2.5) +
  coord_cartesian(xlim = c(0, 40), ylim = c(0, 40)) +
  labs(x = "Share of agriculture in Competitiveness Data\nin %",
       y = "Share of agriculture in ILO Data\nin %") +
  theme(text = element_text(size = 12)) +
  NULL

ggsave(filename = "./figures/FigureF4.pdf",
       width = 17, height = 12, units = "cm")

ggplot(national, aes(x = share_serv*100, y = ilo_serv*100, label = country_survey_year)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1) +
  ggthemes::theme_tufte() +
  ggrepel::geom_text_repel(aes(label=country_survey_year),size=2.5) +
  coord_cartesian(xlim = c(48, 85), ylim = c(48, 85)) +
  labs(x = "Share of service in Competitiveness Data\nin %",
       y = "Share of service in ILO Data\nin %") +
  theme(text = element_text(size = 12)) +
  NULL

ggsave(filename = "./figures/FigureF5.pdf",
       width = 17, height = 12, units = "cm")

ggplot(national, aes(x = share_rest*100, y = ilo_rest*100, label = country_survey_year)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1) +
  ggthemes::theme_tufte() +
  ggrepel::geom_text_repel(aes(label=country_survey_year),size=2.5) +
  coord_cartesian(xlim = c(0, 4), ylim = c(0, 4)) +
  labs(x = "Share of other activities in Competitiveness Data\nin %",
       y = "Share of other activities in ILO Data\nin %") +
  theme(text = element_text(size = 12)) +
  NULL

ggsave(filename = "./figures/FigureF6.pdf",
       width = 17, height = 12, units = "cm")

rm(list = setdiff(ls(), c("df")))

#### Subnational Population #####


# pop.RData stores the population data from the labor surveys used to calculate the competitiveness measure
load("./data/pop.RData")

# popWB.rds stores the population data from the World Bank, which we use as true population data and baseline
popWB <- readRDS("./data/popWB.rds")

pop <- merge(pop, popWB, by.x = c("ISO_3166_2", "country_short", "survey_year"), by.y = c("ISO_3166_2", "country_code", "year"), all.x=T)

pop$population <- as.numeric(as.character(pop$population))/10^6
pop$value <- pop$value/10^6

pop$country_year <- paste0(pop$country_short, "_", pop$survey_year)

pop[which(pop$ISO_3166_2 == "PE-LIM"),]$population <- pop[which(pop$ISO_3166_2 == "PE-LIM"),]$population + pop[which(pop$ISO_3166_2 == "PE-LMA"),]$population
#WB Data does not distinuish between Lima Province and Lima City, as does our data. Thus, we combined both.

pop <- subset(pop, value != 0)# & abs(merge_year - survey_year) < 3)

pop$country_year_MP <- paste0(pop$country_short, "-", pop$merge_year)

df_loop <- data.frame(country_year = unique(paste0(df$country_short, "-", df$merge_year)))

df_loop$country <- substr(df_loop$country_year, 1,3)

df_loop$figure_n <- as.numeric(factor(df_loop$country)) + 6

df_loop <- df_loop %>% 
  arrange(country_year) %>% 
  group_by(country) %>% 
  mutate(sub = 1:n()) %>% 
  ungroup() %>% 
  mutate(lab = paste0("FigureF", figure_n, letters[sub]))

for (i in unique(df_loop$country_year)){
  
  ggplot(subset(pop, country_year_MP == i), aes(x = population, y = value)) +
    geom_point() + 
    geom_smooth(method = "lm", se = F) +
    ggthemes::theme_tufte() +
    labs(x = "Population by district in our data\nin m",
         y = "Population by district\nWorldbank data\nin m")+
     #ggrepel::geom_text_repel(aes(label=ISO_3166_2),size=4, max.overlaps = 50) +
    theme(text = element_text(size = 12)) +
    NULL
  
  fig_lab <- df_loop[which(i == df_loop$country_year),]$lab
  
  ggsave(filename = paste0("./figures/", fig_lab, ".pdf"),
         width = 17, height = 9, units = "cm")
  
}

#### Representativeness of legislators ####

df_rep <- df

# Exclude surveys without relevant question in questionnaire
df_rep <- subset(df_rep, !(country_year %in% c("BRA-2007", "MEX-2003", "SLV-2003")))

# In Argentina, Survey time and legislative periods differ because of Argentina's staggered electoral system
df_rep$country_year <- ifelse(df_rep$period %in% c("05-09", "07-09", "07-11") & df_rep$country == "Argentina", "ARG-2007", df_rep$country_year)
df_rep$country_year <- ifelse(df_rep$period %in% c("711", "09-13") & df_rep$country == "Argentina", "ARG-2009", df_rep$country_year)
df_rep$country_year <- ifelse(df_rep$period %in% c("2009-20013", "2011-2015") & df_rep$country == "Argentina", "ARG-2011", df_rep$country_year)

# In Panama, the province "Panama_Oueste" already existed at the time of the election but apparently, the election was still based on the old administrative divisions
df_rep$district_clean <- ifelse(df_rep$district_clean == "Panama_Oeste", "Panama", df_rep$district_clean)

#### Parties ####

party_seats <- readxl::read_excel("./data/Election_Data.xlsx", sheet = "Parties", trim_ws = F)
party_seats$country_code <- countrycode::countrycode(party_seats$country, origin = "country.name", destination = "iso3c")
party_seats$election_year <- paste0(party_seats$country_code, "-", party_seats$year)

# The survey year is the start of the legislative period, which often differs from the election year
party_seats$start_year <- ifelse(party_seats$election_year %in% c("BOL-2005", "BOL-2009", "BOL-2014", "CHL-2005", "CHL-2009",
                                                                  "CHL-2013", "GTM-2007", "GTM-2011", "GTM-2016", "URY-2014", "VEN-2015",
                                                                  "NIC-2006", "HND-2009", "HND-2013", "HND-2005", "NIC-2011", "NIC-2016"), 
                                 party_seats$year + 1, party_seats$year)

party_seats$country_year <- paste0(party_seats$country_code, "-", party_seats$start_year)
party_seats$party_short <- ifelse(party_seats$seat < 5, "Otros", as.character(party_seats$party_short))
party_seats <- party_seats[,c("country_year", "party_short", "seat")]
#Note: I dropped party name, as this is not optimal to group with


parties_election <- party_seats %>%
  group_by(country_year, party_short) %>%
  summarize(party_seats = sum(seat, na.rm=T)) %>%
  mutate(party_short = paste0(substr(country_year, 1,3), "_", party_short))

parties_survey <- df_rep %>%
  group_by(country_year, party_short) %>%
  summarize(party_respondents_us = sum(!is.na(us_agreement)),
            party_respondents = sum(!is.na(party)))

parties <- merge(parties_survey, parties_election, by = c("country_year", "party_short"), all.x = T)
parties$share <- parties$party_respondents/parties$party_seats
# Note: The share of respondents over actual seats exceeds 1 only for "other" in three waves. This is most likely due to legislators leaving their parties between the election and the survey date.
table(subset(as.character(parties$party_short), parties$share > 1))
hist(parties$share)
hist(subset(parties$share, parties$share <= 1))
summary(parties$share)
# Note: One party in Ecuador is missing because it did not stand in the election and seems to have formed during the legislation period. Was recoded to "other"
table(subset(as.character(parties$party_name), is.na(parties$party_seats)))
table(subset(as.character(parties$country_year), is.na(parties$party_seats)))
cor.test(parties$party_respondents, parties$party_seats)

ggplot(parties, aes(x = party_seats, y = party_respondents, label = country_year)) +
  geom_point(size = 1, position=position_jitter(h=0.25,w=0.25)) + 
  geom_abline(intercept = 0, slope = 1, colour = "black", lty = "dashed") +
  geom_smooth(method = "lm", se = F, colour = "black") +
  ggthemes::theme_tufte() +
  ggtitle("Share of legislators who responded to survey by party") + 
  labs(x="Seats in Parliament", y = "Respondents in Survey") +
  theme(text = element_text(size = 12)) +
  NULL

ggsave(filename = "./figures/FigureF2.pdf",
       width = 17, height = 12, units = "cm")

# The Mexican parliament is much larger than all others and thus, the Mexican parties are big outliers.
parties_no_mex <- subset(parties, !(country_year %in% c("MEX-2006", "MEX-2009")))
ggplot(parties_no_mex, aes(x = party_seats, y = party_respondents, label = country_year)) +
  geom_point(size = 1, position=position_jitter(h=0.25,w=0.25)) + 
  geom_abline(intercept = 0, slope = 1, colour = "black", lty = "dashed") +
  geom_smooth(method = "lm", se = F, colour = "black") +
  ggthemes::theme_tufte() +
  ggtitle("Share of legislators who responded to survey by party (excl. Mexico)") + 
  labs(x="Seats in Parliament", y = "Respondents in Survey") +
  theme(text = element_text(size = 12)) +
  NULL


#### Districts ####

district_seats <- readxl::read_excel("./data/Election_Data.xlsx", sheet = "Districts", trim_ws = F)
district_seats$country_code <- countrycode::countrycode(district_seats$country, origin = "country.name", destination = "iso3c")
district_seats$election_year <- paste0(district_seats$country_code, "-", district_seats$year)

# The survey year is the start of the legislative period, which often differs from the election year
district_seats$start_year <- ifelse(district_seats$election_year %in% c("BOL-2005", "BOL-2009", "BOL-2014", "CHL-2005", "CHL-2009",
                                                                        "CHL-2013", "GTM-2007", "GTM-2011", "GTM-2015", "URY-2014", "VEN-2015",
                                                                        "NIC-2006", "HND-2009", "HND-2013", "HND-2005", "NIC-2011", "NIC-2016"), 
                                    district_seats$year + 1, district_seats$year)

district_seats$country_year <- paste0(district_seats$country_code, "-", district_seats$start_year)
district_seats <- district_seats[,c("country", "country_year", "district", "seat")]

# There is no information available for MEX-2009 and DOM-2010. Therefore, we use data of preceding election. 
mex2009 <- subset(district_seats, country_year == "MEX-2006")
mex2009$country_year <- "MEX-2009"
mex2012 <- subset(district_seats, country_year == "MEX-2006")
mex2012$country_year <- "MEX-2012"
mex2015 <- subset(district_seats, country_year == "MEX-2006")
mex2015$country_year <- "MEX-2015"
mex2018 <- subset(district_seats, country_year == "MEX-2006")
mex2018$country_year <- "MEX-2018"
dom2010 <- subset(district_seats, country_year == "DOM-2006")
dom2010$country_year <- "DOM-2010"

district_seats <- rbind(district_seats, mex2009, mex2012, mex2015, mex2018, dom2010)
rm(list = c("mex2009", "dom2010", "mex2012", "mex2015", "mex2018"))

ISO <- readxl::read_xlsx("./data/district_final.xlsx")
ISO <- unique(ISO[,c("country", "district_clean", "ISO_3166_2")])
district_seats <- merge(district_seats, ISO, by.x = c("country", "district"), by.y = c("country", "district_clean"), all.x = T)

districts_election <- district_seats %>%
  group_by(country_year, district, ISO_3166_2) %>%
  summarize(district_seats = sum(seat))

districts_survey <- df_rep %>%
  group_by(country_year, district_clean) %>%
  summarize(district_respondents_us = sum(!is.na(us_agreement)),
            district_respondents = sum(!is.na(party)))

districts <- merge(districts_survey, districts_election, by.x = c("country_year", "district_clean"), by.y = c("country_year", "district"), all = T)

# replace no respondent by 0
districts$district_respondents <- ifelse(is.na(districts$district_respondents), 0, districts$district_respondents)

districts$share <- districts$district_respondents/districts$district_seats

table(subset(as.character(districts$district_clean), districts$share > 1))
hist(districts$share)
hist(subset(districts$share, districts$share <= 1))
summary(districts$share)
table(subset(as.character(districts$district_clean), is.na(districts$district_seats)))
table(subset(as.character(districts$country_year), is.na(districts$district_seats)))
cor.test(districts$district_respondents, districts$district_seats)

ggplot(districts, aes(x = district_seats, y = district_respondents, label = country_year)) +
  geom_point(size = 1, position=position_jitter(h=0.25,w=0.25)) + 
  geom_abline(intercept = 0, slope = 1, colour = "black", lty = "dashed") +
  geom_smooth(method = "lm", se = F, colour = "black") +
  ggthemes::theme_tufte() +
  ggtitle("Share of legislators who responded to survey by district") + 
  labs(x="Seats in Parliament", y = "Respondents in Survey") +
  theme(text = element_text(size = 12)) +
  NULL

ggsave(filename = "./figures/FigureF1.pdf",
       width = 17, height = 12, units = "cm")

districts <- districts[,c("country_year", "district_clean", "district_seats")]
names(districts)[3] <- "district_magnitude"
districts <- unique(districts)

df <- dplyr::left_join(df, districts, by = c("country_year", "district_clean"))

#### Parliaments ####

parliament_seats <- readxl::read_excel("./data/Election_Data.xlsx", sheet = "Parliaments", trim_ws = F)
parliament_seats$country_code <- countrycode::countrycode(parliament_seats$country, origin = "country.name", destination = "iso3c")
parliament_seats$election_year <- paste0(parliament_seats$country_code, "-", parliament_seats$year)

# The survey year is the start of the legislative period, which often differs from the election year
parliament_seats$start_year <- ifelse(parliament_seats$election_year %in% c("BOL-2005", "BOL-2009", "BOL-2014", "CHL-2005", "CHL-2009",
                                                                        "CHL-2013", "GTM-2007", "GTM-2011", "GTM-2015", "URY-2014", "VEN-2015",
                                                                        "NIC-2006", "HND-2009", "HND-2013", "HND-2005", "NIC-2011", "NIC-2016"), 
                                      parliament_seats$year + 1, parliament_seats$year)

parliament_seats$country_year <- paste0(parliament_seats$country_code, "-", parliament_seats$start_year)
parliament_seats <- parliament_seats[,c("country", "country_year", "seats")]

parliament_survey <- df_rep %>%
  group_by(country_year) %>%
  summarize(parliament_respondents_us = sum(!is.na(us_agreement)),
            parliament_respondents = sum(!is.na(party)))

parliaments <- merge(parliament_survey, parliament_seats, by.x = "country_year", by.y = "country_year", all = T)

parliaments$share <- parliaments$parliament_respondents/parliaments$seats
hist(parliaments$share)
summary(parliaments$share)

rm(list = setdiff(ls(), "df"))
